From dfbc3da708ad43096a06634bdef4a1bbf19a5b76 Mon Sep 17 00:00:00 2001 From: "kaf24@scramble.cl.cam.ac.uk" Date: Sat, 17 Jul 2004 13:01:38 +0000 Subject: [PATCH] bitkeeper revision 1.1104.1.1 (40f923322G2jO4f0TVh9AXW3jpr9bQ) Initial Xen support for 4GB segments thru instruction emulation. The instruction decoder needs some refactoring as there is lots of duplicated crufty code in there right now. Also, the TLS libraries hit the emulator a LOT, but mainly with one or two instructions. Probably we need to patch those within Linux. --- .rootkeys | 1 + linux-2.4.26-xen-sparse/arch/xen/kernel/ldt.c | 208 +++---- .../arch/xen/i386/kernel/ldt.c | 17 +- .../arch/xen/i386/kernel/process.c | 24 - xen/arch/x86/dom0_ops.c | 3 +- xen/arch/x86/memory.c | 2 +- xen/arch/x86/traps.c | 16 +- xen/arch/x86/x86_32/emulate.c | 553 ++++++++++++++++++ xen/arch/x86/x86_32/mm.c | 36 +- xen/include/asm-x86/desc.h | 1 + xen/include/asm-x86/mm.h | 4 +- xen/include/asm-x86/processor.h | 20 +- xen/include/xen/perfc_defn.h | 2 + 13 files changed, 706 insertions(+), 181 deletions(-) create mode 100644 xen/arch/x86/x86_32/emulate.c diff --git a/.rootkeys b/.rootkeys index b5b50680da..19a5f1a5ba 100644 --- a/.rootkeys +++ b/.rootkeys @@ -446,6 +446,7 @@ 3ddb79bccYVzXZJyVaxuv5T42Z1Fsw xen/arch/x86/trampoline.S 3ddb79bcOftONV9h4QCxXOfiT0h91w xen/arch/x86/traps.c 3e32af9aRnYGl4GMOaDKp7JdfhOGhg xen/arch/x86/x86_32/domain_page.c +40f92331jfOlE7MfKwpdkEb1CEf23g xen/arch/x86/x86_32/emulate.c 3ddb79bcecupHj56ZbTa3B0FxDowMg xen/arch/x86/x86_32/entry.S 3ddb79bcHwuCQDjBICDTSis52hWguw xen/arch/x86/x86_32/mm.c 3ddb79bc4nTpGQOe6_-MbyZzkhlhFQ xen/arch/x86/x86_32/usercopy.c diff --git a/linux-2.4.26-xen-sparse/arch/xen/kernel/ldt.c b/linux-2.4.26-xen-sparse/arch/xen/kernel/ldt.c index 6a2bd7a0d9..b0613a17b9 100644 --- a/linux-2.4.26-xen-sparse/arch/xen/kernel/ldt.c +++ b/linux-2.4.26-xen-sparse/arch/xen/kernel/ldt.c @@ -156,132 +156,116 @@ static int read_ldt(void * ptr, unsigned long bytecount) return bytecount; } 
- static int read_default_ldt(void * ptr, unsigned long bytecount) { - int err; - unsigned long size; - void *address; + int err; + unsigned long size; + void *address; - err = 0; - address = &default_ldt[0]; - size = 5*sizeof(struct desc_struct); - if (size > bytecount) - size = bytecount; + err = 0; + address = &default_ldt[0]; + size = 5*sizeof(struct desc_struct); + if (size > bytecount) + size = bytecount; - err = size; - if (copy_to_user(ptr, address, size)) - err = -EFAULT; + err = size; + if (copy_to_user(ptr, address, size)) + err = -EFAULT; - return err; + return err; } static int write_ldt(void * ptr, unsigned long bytecount, int oldmode) { - struct mm_struct * mm = current->mm; - __u32 entry_1, entry_2, *lp; - unsigned long phys_lp, max_limit; - int error; - struct modify_ldt_ldt_s ldt_info; - - error = -EINVAL; - if (bytecount != sizeof(ldt_info)) - goto out; - error = -EFAULT; - if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) - goto out; - - error = -EINVAL; - if (ldt_info.entry_number >= LDT_ENTRIES) - goto out; - if (ldt_info.contents == 3) { - if (oldmode) - goto out; - if (ldt_info.seg_not_present == 0) - goto out; - } - - /* - * This makes our tests for overlap with Xen space easier. There's no good - * reason to have a user segment starting this high anyway. - */ - if (ldt_info.base_addr >= PAGE_OFFSET) - goto out; - - down(&mm->context.sem); - if (ldt_info.entry_number >= mm->context.size) { - error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1); - if (error < 0) - goto out_unlock; - } - - - lp = (__u32 *)((ldt_info.entry_number<<3) + (char *)mm->context.ldt); - phys_lp = arbitrary_virt_to_phys(lp); - - /* Allow LDTs to be cleared by the user. 
*/ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || - (ldt_info.contents == 0 && - ldt_info.read_exec_only == 1 && - ldt_info.seg_32bit == 0 && - ldt_info.limit_in_pages == 0 && - ldt_info.seg_not_present == 1 && - ldt_info.useable == 0 )) { - entry_1 = 0; - entry_2 = 0; - goto install; - } - } - - max_limit = HYPERVISOR_VIRT_START - ldt_info.base_addr; - if ( ldt_info.limit_in_pages ) - max_limit >>= PAGE_SHIFT; - max_limit--; - if ( (ldt_info.limit & 0xfffff) > (max_limit & 0xfffff) ) - ldt_info.limit = max_limit; + struct mm_struct * mm = current->mm; + __u32 entry_1, entry_2, *lp; + unsigned long phys_lp; + int error; + struct modify_ldt_ldt_s ldt_info; + + error = -EINVAL; + if (bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } - entry_1 = ((ldt_info.base_addr & 0x0000ffff) << 16) | - (ldt_info.limit & 0x0ffff); - entry_2 = (ldt_info.base_addr & 0xff000000) | - ((ldt_info.base_addr & 0x00ff0000) >> 16) | - (ldt_info.limit & 0xf0000) | - ((ldt_info.read_exec_only ^ 1) << 9) | - (ldt_info.contents << 10) | - ((ldt_info.seg_not_present ^ 1) << 15) | - (ldt_info.seg_32bit << 22) | - (ldt_info.limit_in_pages << 23) | - 0x7000; - if (!oldmode) - entry_2 |= (ldt_info.useable << 20); + down(&mm->context.sem); + if (ldt_info.entry_number >= mm->context.size) { + error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1); + if (error < 0) + goto out_unlock; + } - /* Install the new entry ... */ - install: - error = HYPERVISOR_update_descriptor(phys_lp, entry_1, entry_2); + lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); + phys_lp = arbitrary_virt_to_phys(lp); + + /* Allow LDTs to be cleared by the user. 
*/ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + if (oldmode || + (ldt_info.contents == 0 && + ldt_info.read_exec_only == 1 && + ldt_info.seg_32bit == 0 && + ldt_info.limit_in_pages == 0 && + ldt_info.seg_not_present == 1 && + ldt_info.useable == 0 )) { + entry_1 = 0; + entry_2 = 0; + goto install; + } + } - out_unlock: - up(&mm->context.sem); - out: - return error; + entry_1 = ((ldt_info.base_addr & 0x0000ffff) << 16) | + (ldt_info.limit & 0x0ffff); + entry_2 = (ldt_info.base_addr & 0xff000000) | + ((ldt_info.base_addr & 0x00ff0000) >> 16) | + (ldt_info.limit & 0xf0000) | + ((ldt_info.read_exec_only ^ 1) << 9) | + (ldt_info.contents << 10) | + ((ldt_info.seg_not_present ^ 1) << 15) | + (ldt_info.seg_32bit << 22) | + (ldt_info.limit_in_pages << 23) | + 0x7000; + if (!oldmode) + entry_2 |= (ldt_info.useable << 20); + + /* Install the new entry ... */ +install: + error = HYPERVISOR_update_descriptor(phys_lp, entry_1, entry_2); + +out_unlock: + up(&mm->context.sem); +out: + return error; } asmlinkage int sys_modify_ldt(int func, void *ptr, unsigned long bytecount) { - int ret = -ENOSYS; - - switch (func) { - case 0: - ret = read_ldt(ptr, bytecount); - break; - case 1: - ret = write_ldt(ptr, bytecount, 1); - break; - case 2: - ret = read_default_ldt(ptr, bytecount); - break; - case 0x11: - ret = write_ldt(ptr, bytecount, 0); - break; - } - return ret; + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; } diff --git a/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/ldt.c b/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/ldt.c index 8b4b77e1f7..d243b3a766 100644 --- a/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/ldt.c +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/ldt.c @@ -182,7 +182,7 @@ static int write_ldt(void __user * ptr, 
unsigned long bytecount, int oldmode) { struct mm_struct * mm = current->mm; __u32 entry_1, entry_2, *lp; - unsigned long phys_lp, max_limit; + unsigned long phys_lp; int error; struct user_desc ldt_info; @@ -203,14 +203,6 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) goto out; } - /* - * This makes our tests for overlap with Xen space - * easier. There's no good reason to have a user segment - * starting this high anyway. - */ - if (ldt_info.base_addr >= PAGE_OFFSET) - goto out; - down(&mm->context.sem); if (ldt_info.entry_number >= mm->context.size) { error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1); @@ -230,13 +222,6 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) } } - max_limit = HYPERVISOR_VIRT_START - ldt_info.base_addr; - if (ldt_info.limit_in_pages) - max_limit >>= PAGE_SHIFT; - max_limit--; - if ((ldt_info.limit & 0xfffff) > (max_limit & 0xfffff)) - ldt_info.limit = max_limit; - entry_1 = LDT_entry_a(&ldt_info); entry_2 = LDT_entry_b(&ldt_info); if (oldmode) diff --git a/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/process.c b/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/process.c index 320b181f8c..1d7f637a66 100644 --- a/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/process.c +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/process.c @@ -331,24 +331,6 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -/* NB. This Xen-specific function is inlined in 'write_ldt'. 
*/ -static int truncate_user_desc(struct user_desc *info) -{ - unsigned long max_limit; - - if (info->base_addr >= PAGE_OFFSET) - return 0; - - max_limit = HYPERVISOR_VIRT_START - info->base_addr; - if (info->limit_in_pages) - max_limit >>= PAGE_SHIFT; - max_limit--; - if ((info->limit & 0xfffff) > (max_limit & 0xfffff)) - info->limit = max_limit; - - return 1; -} - int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) @@ -399,9 +381,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) goto out; - if (!truncate_user_desc(&info)) - goto out; - desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; desc->a = LDT_entry_a(&info); desc->b = LDT_entry_b(&info); @@ -717,9 +696,6 @@ asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) return -EFAULT; idx = info.entry_number; - if (!truncate_user_desc(&info)) - return -EINVAL; - /* * index -1 means the kernel should try to find and * allocate an empty descriptor: diff --git a/xen/arch/x86/dom0_ops.c b/xen/arch/x86/dom0_ops.c index 5df4c568b4..6472a93252 100644 --- a/xen/arch/x86/dom0_ops.c +++ b/xen/arch/x86/dom0_ops.c @@ -133,8 +133,7 @@ void arch_getdomaininfo_ctxt(struct domain *d, full_execution_context_t *c) for ( i = 0; i < 16; i++ ) c->gdt_frames[i] = l1_pgentry_to_pagenr(d->mm.perdomain_pt[i]); - c->gdt_ents = - (GET_GDT_ENTRIES(d) + 1) >> 3; + c->gdt_ents = GET_GDT_ENTRIES(d); } c->guestos_ss = d->thread.guestos_ss; c->guestos_esp = d->thread.guestos_sp; diff --git a/xen/arch/x86/memory.c b/xen/arch/x86/memory.c index a2cef38ed2..f40c1688cb 100644 --- a/xen/arch/x86/memory.c +++ b/xen/arch/x86/memory.c @@ -178,7 +178,7 @@ int alloc_segdesc_page(struct pfn_info *page) int i; for ( i = 0; i < 512; i++ ) - if ( unlikely(!check_descriptor(descs[i*2], descs[i*2+1])) ) + if ( unlikely(!check_descriptor(&descs[i*2])) ) goto fail; 
unmap_domain_mem(descs); diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index aa74ae4dfe..8de5573d27 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -51,16 +51,7 @@ #include #include -#define GTBF_TRAP 1 -#define GTBF_TRAP_NOCODE 2 -#define GTBF_TRAP_CR2 4 -struct guest_trap_bounce { - unsigned long error_code; /* 0 */ - unsigned long cr2; /* 4 */ - unsigned short flags; /* 8 */ - unsigned short cs; /* 10 */ - unsigned long eip; /* 12 */ -} guest_trap_bounce[NR_CPUS] = { { 0 } }; +struct guest_trap_bounce guest_trap_bounce[NR_CPUS] = { { 0 } }; #if defined(__i386__) @@ -451,6 +442,11 @@ asmlinkage void do_general_protection(struct pt_regs *regs, long error_code) goto finish_propagation; } } + +#if defined(__i386__) + if ( (error_code == 0) && gpf_emulate_4gb(regs) ) + return; +#endif /* Pass on GPF as is. */ ti = current->thread.traps + 13; diff --git a/xen/arch/x86/x86_32/emulate.c b/xen/arch/x86/x86_32/emulate.c new file mode 100644 index 0000000000..bd0f4c8f62 --- /dev/null +++ b/xen/arch/x86/x86_32/emulate.c @@ -0,0 +1,553 @@ +/****************************************************************************** + * arch/x86/x86_32/emulate.c + * + * Emulation of certain classes of IA32 instruction. Used to emulate 4GB + * segments, for example. + * + * Copyright (c) 2004, K A Fraser + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int get_baselimit(u16 seg, unsigned long *base, unsigned long *limit) +{ + struct domain *d = current; + unsigned long *table, a, b; + int ldt = !!(seg & 4); + int idx = (seg >> 3) & 8191; + + /* Get base and check limit. */ + if ( ldt ) + { + table = (unsigned long *)LDT_VIRT_START; + if ( idx >= d->mm.ldt_ents ) + goto fail; + } + else /* gdt */ + { + table = (unsigned long *)GET_GDT_ADDRESS(d); + if ( idx >= GET_GDT_ENTRIES(d) ) + goto fail; + } + + /* Grab the segment descriptor. */ + if ( __get_user(a, &table[2*idx+0]) || + __get_user(b, &table[2*idx+1]) ) + goto fail; /* Barking up the wrong tree. Decode needs a page fault.*/ + + /* We only parse 32-bit code and data segments. */ + if ( (b & (_SEGMENT_P|_SEGMENT_S|_SEGMENT_DB)) != + (_SEGMENT_P|_SEGMENT_S|_SEGMENT_DB) ) + goto fail; + + /* Decode base and limit. */ + *base = (b&(0xff<<24)) | ((b&0xff)<<16) | (a>>16); + *limit = ((b & 0xf0000) | (a & 0x0ffff)) + 1; + if ( (b & _SEGMENT_G) ) + *limit <<= 12; + + /* + * Anything that looks like a truncated segment we assume ought really + * to be a 4GB segment. DANGER! 
+ */ + if ( (PAGE_OFFSET - (*base + *limit)) < PAGE_SIZE ) + *limit = 0; + + return 1; + + fail: + return 0; +} + +int linearise_address(u16 seg, unsigned long off, unsigned long *linear) +{ + unsigned long base, limit; + + if ( !get_baselimit(seg, &base, &limit) ) + return 0; + + if ( off > (limit-1) ) + return 0; + + *linear = base + off; + + return 1; +} + +void *decode_reg(struct pt_regs *regs, u8 b) +{ + switch ( b & 7 ) + { + case 0: return &regs->eax; + case 1: return &regs->ecx; + case 2: return &regs->edx; + case 3: return &regs->ebx; + case 4: return &regs->esp; + case 5: return &regs->ebp; + case 6: return &regs->esi; + case 7: return &regs->edi; + } + + return NULL; +} + +/* + * Decode an effective address: + * @ppb (IN/OUT): IN == address of ModR/M byte; OUT == byte following EA. + * @preg (OUT) : address in pt_regs block of the EA register parameter. + * @pmem (OUT) : address of the EA memory parameter. + * @pseg (IN) : address in pt_regs block of the override segment. + * @regs (IN) : address of the pt_regs block. + */ +int decode_effective_address(u8 **ppb, void **preg, void **pmem, + unsigned int *pseg, struct pt_regs *regs) +{ + u8 modrm, mod, reg, rm, *pb = *ppb; + void *memreg, *regreg; + unsigned long ea, limit, offset; + u8 disp8; + u32 disp32 = 0; + + if ( get_user(modrm, pb) ) + { + DPRINTK("Fault while extracting modrm byte\n"); + return 0; + } + + pb++; + + mod = (modrm >> 6) & 3; + reg = (modrm >> 3) & 7; + rm = (modrm >> 0) & 7; + + if ( rm == 4 ) + { + DPRINTK("FIXME: Add decoding for the SIB byte.\n"); + return 0; + } + + /* Decode Reg and R/M fields. */ + regreg = decode_reg(regs, reg); + memreg = decode_reg(regs, rm); + + /* Decode Mod field. 
*/ + switch ( modrm >> 6 ) + { + case 0: + if ( pseg == NULL ) + pseg = &regs->xds; + disp32 = 0; + if ( rm == 5 ) /* disp32 rather than (EBP) */ + { + memreg = NULL; + if ( get_user(disp32, (u32 *)pb) ) + { + DPRINTK("Fault while extracting <disp32>.\n"); + return 0; + } + pb += 4; + } + break; + + case 1: + if ( pseg == NULL ) /* NB. EBP defaults to SS */ + pseg = (rm == 5) ? &regs->xss : &regs->xds; + if ( get_user(disp8, pb) ) + { + DPRINTK("Fault while extracting <disp8>.\n"); + return 0; + } + pb++; + disp32 = (disp8 & 0x80) ? (disp8 | ~0xff) : disp8;; + break; + + case 2: + if ( pseg == NULL ) /* NB. EBP defaults to SS */ + pseg = (rm == 5) ? &regs->xss : &regs->xds; + if ( get_user(disp32, (u32 *)pb) ) + { + DPRINTK("Fault while extracting <disp32>.\n"); + return 0; + } + pb += 4; + break; + + case 3: + DPRINTK("Not a memory operand!\n"); + return 0; + } + + if ( !get_baselimit((u16)(*pseg), &ea, &limit) ) + return 0; + if ( limit != 0 ) + { + DPRINTK("Bailing: not a 4GB data segment.\n"); + return 0; + } + + offset = disp32; + if ( memreg != NULL ) + offset += *(u32 *)memreg; + if ( (offset & 0xf0000000) != 0xf0000000 ) + { + DPRINTK("Bailing: not a -ve offset into 4GB segment.\n"); + return 0; + } + + ea += offset; + if ( ea > (PAGE_OFFSET - PAGE_SIZE) ) + { + DPRINTK("!!!! DISALLOWING UNSAFE ACCESS !!!!\n"); + return 0; + } + + *ppb = pb; + *preg = regreg; + *pmem = (void *)ea; + + return 1; +} + +/* + * Called from the general-protection fault handler to attempt to decode + * and emulate an instruction that depends on 4GB segments. At this point + * we assume that the instruction itself is paged into memory (the CPU + * must have triggered this in order to decode the instruction itself). 
+ */ +int gpf_emulate_4gb(struct pt_regs *regs) +{ + struct domain *d = current; + trap_info_t *ti; + u8 *eip, *nextbyte, b, mb, rb; + u16 mw, rw; + u32 ml, rl, eflags; + unsigned int *pseg = NULL; + int i; + int opsz_override = 0; + void *reg, *mem; + struct guest_trap_bounce *gtb; + + if ( !linearise_address((u16)regs->xcs, regs->eip, (unsigned long *)&eip) ) + { + DPRINTK("Cannot linearise %04x:%08lx\n", regs->xcs, regs->eip); + return 0; + } + + /* Parse prefix bytes. We're basically looking for segment override. */ + for ( i = 0; i < 4; i++ ) + { + if ( get_user(b, &eip[i]) ) + { + DPRINTK("Fault while accessing byte %d of instruction\n", i); + return 0; + } + + switch ( b ) + { + case 0xf0: /* LOCK */ + case 0xf2: /* REPNE/REPNZ */ + case 0xf3: /* REP/REPE/REPZ */ + case 0x67: /* Address-size override */ + DPRINTK("Unhandleable prefix byte %02x\n", b); + goto undecodeable; + case 0x66: /* Operand-size override */ + opsz_override = 1; + break; + case 0x2e: /* CS override */ + pseg = &regs->xcs; + break; + case 0x3e: /* DS override */ + pseg = &regs->xds; + break; + case 0x26: /* ES override */ + pseg = &regs->xes; + break; + case 0x64: /* FS override */ + pseg = &regs->xfs; + break; + case 0x65: /* GS override */ + pseg = &regs->xgs; + break; + case 0x36: /* SS override */ + pseg = &regs->xss; + break; + default: /* Not a prefix byte */ + goto done_prefix; + } + } + done_prefix: + + nextbyte = &eip[i+1]; + if ( !decode_effective_address(&nextbyte, &reg, &mem, pseg, regs) ) + goto undecodeable; + + /* Only handle single-byte opcodes right now. Sufficient for MOV. */ + /* + * XXX Now I see how this decode routine is panning out, it needs + * refactoring. Lots of duplicated cruft in here... 
+ */ + switch ( b ) + { + case 0x88: /* movb r,r/m */ + if ( __put_user(*(u8 *)reg, (u8 *)mem) ) + goto page_fault_w; + regs->eip += nextbyte - eip; + break; + case 0x89: /* movl r,r/m */ + if ( opsz_override ) + { + if ( __put_user(*(u16 *)reg, (u16 *)mem) ) + goto page_fault_w; + } + else + { + if ( __put_user(*(u32 *)reg, (u32 *)mem) ) + goto page_fault_w; + } + regs->eip += nextbyte - eip; + break; + case 0x8a: /* movb r/m,r */ + if ( __get_user(*(u8 *)reg, (u8 *)mem) ) + goto page_fault_r; + regs->eip += nextbyte - eip; + break; + case 0x8b: /* movl r/m,r */ + if ( opsz_override ) + { + if ( __get_user(*(u16 *)reg, (u16 *)mem) ) + goto page_fault_r; + } + else + { + if ( __get_user(*(u32 *)reg, (u32 *)mem) ) + goto page_fault_r; + } + regs->eip += nextbyte - eip; + break; + case 0xc6: /* movb imm,r/m */ + if ( reg != &regs->eax ) /* Reg == /0 */ + goto undecodeable; + if ( get_user(rb, nextbyte) ) + { + DPRINTK("Fault while extracting immediate byte\n"); + return 0; + } + if ( __put_user(rb, (u8 *)mem) ) + goto page_fault_w; + regs->eip += nextbyte - eip + 1; + break; + case 0xc7: /* movl imm,r/m */ + if ( reg != &regs->eax ) /* Reg == /0 */ + goto undecodeable; + if ( opsz_override ) + { + if ( get_user(rw, (u16 *)nextbyte) ) + { + DPRINTK("Fault while extracting immediate word\n"); + return 0; + } + if ( __put_user(rw, (u16 *)mem) ) + goto page_fault_w; + regs->eip += nextbyte - eip + 2; + } + else + { + if ( get_user(rl, (u32 *)nextbyte) ) + { + DPRINTK("Fault while extracting immediate longword\n"); + return 0; + } + if ( __put_user(rl, (u32 *)mem) ) + goto page_fault_w; + regs->eip += nextbyte - eip + 4; + } + break; + case 0x80: /* cmpb imm8,r/m */ + if ( reg != &regs->edi ) /* Reg == /7 */ + goto undecodeable; + if ( get_user(rb, nextbyte) ) + { + DPRINTK("Fault while extracting immediate byte\n"); + return 0; + } + if ( __get_user(mb, (u8 *)mem) ) + goto page_fault_r; + __asm__ __volatile__ ( + "cmpb %b1,%b2 ; pushf ; popl %0" + : "=a" (eflags) + : "0" (rb), 
"b" (mb) ); + regs->eflags &= ~0x8d5; /* OF,SF,ZF,AF,PF,CF */ + regs->eflags |= eflags & 0x8d5; + regs->eip += nextbyte - eip + 1; + break; + case 0x81: /* cmpl imm32,r/m */ + if ( reg != &regs->edi ) /* Reg == /7 */ + goto undecodeable; + if ( opsz_override ) + { + if ( get_user(rw, (u16 *)nextbyte) ) + { + DPRINTK("Fault while extracting immediate word\n"); + return 0; + } + if ( __get_user(mw, (u16 *)mem) ) + goto page_fault_r; + __asm__ __volatile__ ( + "cmpw %w1,%w2 ; pushf ; popl %0" + : "=a" (eflags) + : "0" (rw), "b" (mw) ); + regs->eip += nextbyte - eip + 2; + } + else + { + if ( get_user(rl, (u32 *)nextbyte) ) + { + DPRINTK("Fault while extracting immediate longword\n"); + return 0; + } + if ( __get_user(ml, (u32 *)mem) ) + goto page_fault_r; + __asm__ __volatile__ ( + "cmpl %1,%2 ; pushf ; popl %0" + : "=a" (eflags) + : "0" (rl), "b" (ml) ); + regs->eip += nextbyte - eip + 4; + } + regs->eflags &= ~0x8d5; /* OF,SF,ZF,AF,PF,CF */ + regs->eflags |= eflags & 0x8d5; + break; + case 0x83: /* cmpl imm8,r/m */ + if ( reg != &regs->edi ) /* Reg == /7 */ + goto undecodeable; + if ( get_user(rb, nextbyte) ) + { + DPRINTK("Fault while extracting immediate byte\n"); + return 0; + } + if ( opsz_override ) + { + rw = (rb & 0x80) ? (rb | ~0xff) : rb; + if ( __get_user(mw, (u16 *)mem) ) + goto page_fault_r; + __asm__ __volatile__ ( + "cmpw %w1,%w2 ; pushf ; popl %0" + : "=a" (eflags) + : "0" (rw), "b" (mw) ); + regs->eip += nextbyte - eip + 2; + } + else + { + rl = (rb & 0x80) ? 
(rb | ~0xff) : rb; + if ( __get_user(ml, (u32 *)mem) ) + goto page_fault_r; + __asm__ __volatile__ ( + "cmpl %1,%2 ; pushf ; popl %0" + : "=a" (eflags) + : "0" (rl), "b" (ml) ); + } + regs->eflags &= ~0x8d5; /* OF,SF,ZF,AF,PF,CF */ + regs->eflags |= eflags & 0x8d5; + regs->eip += nextbyte - eip + 1; + break; + case 0x38: /* cmpb r,r/m */ + case 0x3a: /* cmpb r/m,r */ + rb = *(u8 *)reg; + if ( __get_user(mb, (u8 *)mem) ) + goto page_fault_r; + __asm__ __volatile__ ( + "cmpb %b1,%b2 ; pushf ; popl %0" + : "=a" (eflags) + : "0" ((b==0x38)?rb:mb), "b" ((b==0x38)?mb:rb) ); + regs->eflags &= ~0x8d5; /* OF,SF,ZF,AF,PF,CF */ + regs->eflags |= eflags & 0x8d5; + regs->eip += nextbyte - eip; + break; + case 0x39: /* cmpl r,r/m */ + case 0x3b: /* cmpl r/m,r */ + if ( opsz_override ) + { + rw = *(u16 *)reg; + if ( __get_user(mw, (u16 *)mem) ) + goto page_fault_r; + __asm__ __volatile__ ( + "cmpw %w1,%w2 ; pushf ; popl %0" + : "=a" (eflags) + : "0" ((b==0x38)?rw:mw), "b" ((b==0x38)?mw:rw) ); + } + else + { + rl = *(u32 *)reg; + if ( __get_user(ml, (u32 *)mem) ) + goto page_fault_r; + __asm__ __volatile__ ( + "cmpl %1,%2 ; pushf ; popl %0" + : "=a" (eflags) + : "0" ((b==0x38)?rl:ml), "b" ((b==0x38)?ml:rl) ); + } + regs->eflags &= ~0x8d5; /* OF,SF,ZF,AF,PF,CF */ + regs->eflags |= eflags & 0x8d5; + regs->eip += nextbyte - eip; + break; + default: + DPRINTK("Unhandleable opcode byte %02x\n", b); + goto undecodeable; + } + + perfc_incrc(emulations); + + /* Success! */ + return 1; + + undecodeable: + printk("Undecodable instruction %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x " + "caused GPF(0) at %04x:%08lx\n", + eip[0], eip[1], eip[2], eip[3], + eip[4], eip[5], eip[6], eip[7], + regs->xcs, regs->eip); + return 0; + + page_fault_w: + ti = &d->thread.traps[14]; + gtb = &guest_trap_bounce[d->processor]; + /* + * XXX We don't distinguish between page-not-present and read-only. + * Linux doesn't care, but this might need fixing if others do. 
+ */ + gtb->error_code = 6; /* user fault, write access, page not present */ + goto page_fault_common; + page_fault_r: + ti = &d->thread.traps[14]; + gtb = &guest_trap_bounce[d->processor]; + gtb->error_code = 4; /* user fault, read access, page not present */ + page_fault_common: + gtb->flags = GTBF_TRAP_CR2; + gtb->cr2 = (unsigned long)mem; + gtb->cs = ti->cs; + gtb->eip = ti->address; + if ( TI_GET_IF(ti) ) + d->shared_info->vcpu_data[0].evtchn_upcall_mask = 1; + return 1; +} + + diff --git a/xen/arch/x86/x86_32/mm.c b/xen/arch/x86/x86_32/mm.c index d740206108..a77ad89e2b 100644 --- a/xen/arch/x86/x86_32/mm.c +++ b/xen/arch/x86/x86_32/mm.c @@ -1,7 +1,7 @@ /****************************************************************************** - * arch/i386/mm.c + * arch/x86/x86_32/mm.c * - * Modifications to Linux original are copyright (c) 2002-2003, K A Fraser + * Modifications to Linux original are copyright (c) 2004, K A Fraser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -164,9 +164,9 @@ long do_stack_switch(unsigned long ss, unsigned long esp) /* Returns TRUE if given descriptor is valid for GDT or LDT. */ -int check_descriptor(unsigned long a, unsigned long b) +int check_descriptor(unsigned long *d) { - unsigned long base, limit; + unsigned long base, limit, a = d[0], b = d[1]; /* A not-present descriptor will always fault, so is safe. */ if ( !(b & _SEGMENT_P) ) @@ -211,15 +211,27 @@ int check_descriptor(unsigned long a, unsigned long b) goto good; } - /* Check that base/limit do not overlap Xen-private space. */ + /* Check that base is at least a page away from Xen-private area. */ base = (b&(0xff<<24)) | ((b&0xff)<<16) | (a>>16); + if ( base >= (PAGE_OFFSET - PAGE_SIZE) ) + goto bad; + + /* Check and truncate the limit if necessary. */ limit = (b&0xf0000) | (a&0xffff); limit++; /* We add one because limit is inclusive. 
*/ if ( (b & _SEGMENT_G) ) limit <<= 12; if ( ((base + limit) <= base) || ((base + limit) > PAGE_OFFSET) ) - goto bad; + { + /* Need to truncate. Calculate and poke a best-effort limit. */ + limit = PAGE_OFFSET - base; + if ( (b & _SEGMENT_G) ) + limit >>= 12; + limit--; + d[0] &= ~0x0ffff; d[0] |= limit & 0x0ffff; + d[1] &= ~0xf0000; d[1] |= limit & 0xf0000; + } good: return 1; @@ -275,7 +287,7 @@ long set_gdt(struct domain *d, mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR); SET_GDT_ADDRESS(d, GDT_VIRT_START); - SET_GDT_ENTRIES(d, (entries*8)-1); + SET_GDT_ENTRIES(d, entries); return 0; @@ -311,11 +323,14 @@ long do_set_gdt(unsigned long *frame_list, unsigned int entries) long do_update_descriptor( unsigned long pa, unsigned long word1, unsigned long word2) { - unsigned long *gdt_pent, pfn = pa >> PAGE_SHIFT; + unsigned long *gdt_pent, pfn = pa >> PAGE_SHIFT, d[2]; struct pfn_info *page; long ret = -EINVAL; - if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(word1, word2) ) + d[0] = word1; + d[1] = word2; + + if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(d) ) return -EINVAL; page = &frame_table[pfn]; @@ -346,8 +361,7 @@ long do_update_descriptor( /* All is good so make the update. 
*/ gdt_pent = map_domain_mem(pa); - gdt_pent[0] = word1; - gdt_pent[1] = word2; + memcpy(gdt_pent, d, 8); unmap_domain_mem(gdt_pent); put_page_type(page); diff --git a/xen/include/asm-x86/desc.h b/xen/include/asm-x86/desc.h index 780f9c8728..0e2967c4f8 100644 --- a/xen/include/asm-x86/desc.h +++ b/xen/include/asm-x86/desc.h @@ -32,6 +32,7 @@ #define _SEGMENT_S ( 1<<12) /* System descriptor (yes iff S==0) */ #define _SEGMENT_DPL ( 3<<13) /* Descriptor Privilege Level */ #define _SEGMENT_P ( 1<<15) /* Segment Present */ +#define _SEGMENT_DB ( 1<<22) /* 16- or 32-bit segment */ #define _SEGMENT_G ( 1<<23) /* Granularity */ #ifndef __ASSEMBLY__ diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index b8a4c5e496..c5e809b15c 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -274,7 +274,7 @@ static inline int get_page_and_type(struct pfn_info *page, ASSERT(((_p)->count_and_flags & PGC_count_mask) != 0); \ ASSERT((_p)->u.domain == (_d)) -int check_descriptor(unsigned long a, unsigned long b); +int check_descriptor(unsigned long *d); /* * Use currently-executing domain's pagetables on the specified CPUs. @@ -298,7 +298,7 @@ extern unsigned long *machine_to_phys_mapping; /* Part of the domain API. 
*/ int do_mmu_update(mmu_update_t *updates, int count, int *success_count); -#define DEFAULT_GDT_ENTRIES ((LAST_RESERVED_GDT_ENTRY*8)+7) +#define DEFAULT_GDT_ENTRIES (LAST_RESERVED_GDT_ENTRY+1) #define DEFAULT_GDT_ADDRESS ((unsigned long)gdt_table) #ifdef MEMORY_GUARD diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h index 11060e3c49..13b81f709a 100644 --- a/xen/include/asm-x86/processor.h +++ b/xen/include/asm-x86/processor.h @@ -345,6 +345,20 @@ long set_fast_trap(struct domain *p, int idx); #endif /* __x86_64__ */ +#define GTBF_TRAP 1 +#define GTBF_TRAP_NOCODE 2 +#define GTBF_TRAP_CR2 4 +struct guest_trap_bounce { + unsigned long error_code; /* 0 */ + unsigned long cr2; /* 4 */ + unsigned short flags; /* 8 */ + unsigned short cs; /* 10 */ + unsigned long eip; /* 12 */ +}; +extern struct guest_trap_bounce guest_trap_bounce[]; + +extern int gpf_emulate_4gb(struct pt_regs *regs); + struct mm_struct { /* * Every domain has a L1 pagetable of its own. Per-domain mappings @@ -401,10 +415,10 @@ static inline void write_ptbase(struct mm_struct *mm) } /* Convenient accessor for mm.gdt. 
*/ -#define SET_GDT_ENTRIES(_p, _e) ((*(u16 *)((_p)->mm.gdt + 0)) = (_e)) +#define SET_GDT_ENTRIES(_p, _e) ((*(u16 *)((_p)->mm.gdt + 0)) = (((_e)<<3)-1)) #define SET_GDT_ADDRESS(_p, _a) ((*(unsigned long *)((_p)->mm.gdt + 2)) = (_a)) -#define GET_GDT_ENTRIES(_p) ((*(u16 *)((_p)->mm.gdt + 0))) -#define GET_GDT_ADDRESS(_p) ((*(unsigned long *)((_p)->mm.gdt + 2))) +#define GET_GDT_ENTRIES(_p) (((*(u16 *)((_p)->mm.gdt + 0))+1)>>3) +#define GET_GDT_ADDRESS(_p) (*(unsigned long *)((_p)->mm.gdt + 2)) void destroy_gdt(struct domain *d); long set_gdt(struct domain *d, diff --git a/xen/include/xen/perfc_defn.h b/xen/include/xen/perfc_defn.h index 40d3b96687..4868d2a87c 100644 --- a/xen/include/xen/perfc_defn.h +++ b/xen/include/xen/perfc_defn.h @@ -1,4 +1,6 @@ +PERFCOUNTER_CPU (emulations, "instructions emulated" ) + PERFCOUNTER_CPU( irqs, "#interrupts" ) PERFCOUNTER_CPU( ipis, "#IPIs" ) PERFCOUNTER_CPU( irq_time, "cycles spent in irq handler" ) -- 2.30.2